#"Measuring Canonicity"
#Code for replicating tables and figures

library(dplyr)
library(tidyverse) 
library(ggplot2)
library(proxy)
library(vegan)
library(reshape2)

## Load the dataset  

reading_lists <- read_csv("Hispanic_canon.csv")

list_size<-reading_lists %>% select(ID) %>% group_by(ID) %>% summarize(SIZE=n())  #size of samples

#create a new dataset by gathering gender info for figure 1.

gender<-reading_lists %>% group_by(ID) %>% filter(GENDER=='F') %>% summarize (TOT_F= n())  %>% ungroup() %>% mutate(SIZE=list_size$SIZE) %>% mutate(AVG=(TOT_F/SIZE)*100)  %>% mutate(Residuals=AVG-((sum(TOT_F)/sum(SIZE))*100)) 


#FIGURE 1  
theme_set(theme_bw())  # pre-set the bw theme.
ggplot(gender, aes(x=SIZE, y=Residuals, label=ID)) + 
  geom_point(stat='identity', fill="black", size=6)  +
  geom_segment(aes(y = 0, 
                   x = `SIZE`, 
                   yend = Residuals, 
                   xend = `SIZE`), 
               color = "black") +
  geom_text(color="white", size=2) +
  labs(title="Gender Gap", 
       subtitle="with sample size",
       y="Residuals", 
       x='Sample size') + 
  ylim(-13, 13) +
  coord_flip()




#TABLE 1 
#create new dataset with relative frequencies of authors across samples

authors_levels<-distinct(reading_lists, AUTHOR, ID, .keep_all = TRUE) #making sure there are no repeats
authors_levels$count<-as.numeric(ave(authors_levels$AUTHOR, authors_levels$AUTHOR, FUN = length)) #getting counts
authors_levels<-authors_levels %>% mutate(percent = (count/length(unique(authors_levels$ID)))*100)


#CREATE TABLE 1

t<-authors_levels %>% filter(percent>=50) %>% group_by(ID) %>% summarize (CANON_AUTHORS= n()) %>% mutate(SIZE=list_size$SIZE)%>% mutate(DOMINANCE=(CANON_AUTHORS/SIZE)*100)

t2<-authors_levels %>% filter(percent<=5) %>% group_by(ID) %>% summarize (RARE= n()) #counts for rare authors
t<-t %>% group_by(ID) %>% left_join(t2, by = "ID")
t[is.na(t)] <- 0  #some lists have no rare authors
TABLE_1<-t %>% mutate(RARE=(RARE/SIZE)*100)  #realtive freq of rare aut.

#FIGURE 2

library(ggplot2)
theme_set(theme_bw())  # pre-set the bw theme.

# Scatterplot
gg <- ggplot(TABLE_1, aes(x=ID, y=SIZE)) + 
  geom_point(aes(size=DOMINANCE), alpha = 0.3, color='blue') + 
  #  geom_smooth(method="loess", se=F) + 
  xlim(c(0, 95)) + 
  ylim(c(0, 355)) + 
  labs(subtitle="Sample size vs Canonical percent", 
       y="Sample size", 
       x="Sample ID", 
       title="Canonical Dominance"
  )

plot(gg)

#Shannon/Pielou indexes for Table 2

t<-authors_levels %>% filter(percent>=50) %>% group_by(ID) %>% summarize(CANON_AUTHORS=n())  #get count for canon authors


t2<-authors_levels %>% filter(percent<=5) %>% group_by(ID) %>% summarize(RARE=n())  #get count for rare authors

t<-t %>% group_by(ID) %>% left_join(t2, by = "ID")  

t[is.na(t)] <- 0

t<-t %>% group_by(ID) %>% left_join(list_size, by = "ID")
#colnames(t)<-c("ID","CANON_AUTHORS", "RARE", "SIZE") if needed

t<-t %>% mutate(OTHERS=SIZE-(CANON_AUTHORS+RARE))  #create a third group for 'other' authors

t2<-t[,-c(1,4)]  #get rid of cols not needed for next step-will bring back later

H<-diversity(t2, index = "shannon")   #Using Vegan package to get Shannon index


#Create TABLE 2

TABLE_2<-as.data.frame(H)  #add shannon results 
colnames(TABLE_2)<-"SHANNON"
TABLE_2["ID"]<-t$ID  #bring back deleted columns
TABLE_2["SIZE"]<-t$SIZE  #bring back deleted columns

#Calculate Pielou's evenness (J)
## Species richness (S) and Pielou's evenness (J):
S <- specnumber(t2) ## rowSums(t2 > 0) does the same...
J <- H/log(S)
TABLE_2["EVENNESS"]<-J   #adding final col for table 2

#correlation shannon/dominance

cor.test(TABLE_1$DOMINANCE,TABLE_2$SHANNON)

#optional
#plot(TABLE_1$DOMINANCE,TABLE_2$SHANNON)



##########jaccard distance/comparing lists


#getting samples we need according to ID
change<-authors_levels %>% filter(ID %in% c(19, 81,20, 21, 28, 29, 39, 91, 75, 59, 87,88, 89))

#next two lines create  data frame with 1 and 0s--might give warnings because it is "coercing argument of type 'double' to logical"--ignore warnings
df_change <- dcast(change, ID ~ AUTHOR, fun.aggregate = any, value.var = "percent")
df_change<-df_change %>% mutate_all(~replace(., is.na(.), 1))

#identifying institutions/samples with letters/dates
group_names<-c("A-2015","B-2017","B-2009","C-2016","C-2018","D-post2018","E-2020","E-2008","A-2007","F-2013-2015","F-2016-18","F-2019-2021","D-pre2018")

row.names(df_change)<-group_names
df_change<-df_change[,-1] #do not need ID 

# We are using the jaccard function of "proxy" library, but "binary" method of R's dist() can also be used as it is the equivalent of jaccard
# for example: dist(df_change,method="binary")

jac.dist<-dist(df_change, by_rows = TRUE, method = "Jaccard")

#FIGURE 3
#dendogram

dd<-hclust(jac.dist, method = "ward.D2") 
op = par(bg = "#F9F9F9")
plot(dd, col = "#473d34", col.main = "#7C8071", col.lab = "#7C8071",
     col.axis = "#473d34", lwd = 1, lty = 1, sub = '', xlab = "Jaccard difference",  main= "Updated Reading Lists")




#####TABLE 4 info comes from  tables 1 and 2


TABLE_4<-TABLE_2 %>% filter(ID %in% c(19, 81,20, 21, 28, 29, 39, 91, 75, 59, 87,88, 89))  
t3<-gender %>% filter(ID %in% c(19, 81,20, 21, 28, 29, 39, 91, 75, 59, 87,88, 89)) %>% select(ID,Residuals)
TABLE_4<-TABLE_4 %>% group_by(ID) %>% left_join(t3, by = "ID")
t4<-TABLE_1 %>% filter(ID %in% c(19, 81,20, 21, 28, 29, 39, 91, 75, 59, 87,88, 89)) %>% select(ID,DOMINANCE)
TABLE_4<-TABLE_4 %>% group_by(ID) %>% left_join(t4, by = "ID")
row.names(TABLE_4)<-group_names  #using the names from Jaccard section


